In [ ]:
import polars as pl
import pint
import pyarrow
from great_tables import GT, html
import plotnine as pn
med_gran = pl.read_parquet("./data/emissions_medium_granularity.parquet")
ureg = pint.UnitRegistry()
ureg.define("Million_bbl = 1e6*bbl = Mbbl")
pn.theme_set(pn.theme_bw())
Out[ ]:
<plotnine.themes.theme_bw.theme_bw at 0x18b284a7a00>

Counts of records¶

Loaded up polars and the parquet file containing med-granularity production & emissions data from Carbon Majors (https://carbonmajors.org/Downloads). Let's take a peek at the data, first in terms of counts of rows from different parent types, commodities, and units.

In [ ]:
med_gran.describe()
slim_counts = med_gran.select(
    pl.struct(['parent_type', 'commodity', 'production_unit']).value_counts(sort=True)
    ).unnest('parent_type').unnest('parent_type')
print(slim_counts)
shape: (23, 4)
┌────────────────────────┬────────────────────┬────────────────────┬───────┐
│ parent_type            ┆ commodity          ┆ production_unit    ┆ count │
│ ---                    ┆ ---                ┆ ---                ┆ ---   │
│ str                    ┆ str                ┆ str                ┆ u32   │
╞════════════════════════╪════════════════════╪════════════════════╪═══════╡
│ Investor-owned Company ┆ Oil & NGL          ┆ Million bbl/yr     ┆ 1945  │
│ Investor-owned Company ┆ Natural Gas        ┆ Bcf/yr             ┆ 1802  │
│ State-owned Entity     ┆ Oil & NGL          ┆ Million bbl/yr     ┆ 1745  │
│ State-owned Entity     ┆ Natural Gas        ┆ Bcf/yr             ┆ 1617  │
│ Investor-owned Company ┆ Bituminous Coal    ┆ Million tonnes/yr  ┆ 677   │
│ …                      ┆ …                  ┆ …                  ┆ …     │
│ State-owned Entity     ┆ Metallurgical Coal ┆ Million tonnes/yr  ┆ 126   │
│ Nation State           ┆ Cement             ┆ Million Tonnes CO2 ┆ 95    │
│ Nation State           ┆ Oil & NGL          ┆ Million bbl/yr     ┆ 43    │
│ Investor-owned Company ┆ Anthracite Coal    ┆ Million tonnes/yr  ┆ 38    │
│ Nation State           ┆ Natural Gas        ┆ Bcf/yr             ┆ 33    │
└────────────────────────┴────────────────────┴────────────────────┴───────┘

Taking a quick look at how many records we have for each entity type and production unit.

In [ ]:
(
    pn.ggplot(slim_counts, pn.aes(x="parent_type", y="count", fill="production_unit"))
    + pn.geom_col()
).show()
No description has been provided for this image

Quantities produced, top-line¶

Let's instead aggregate the production records across all years and countries. We'll keep the different production units that Carbon Majors reports in. If they didn't want to attempt unit conversion, I won't either for purposes of this example. In particular, their methodology for calculating cement production (worth reading more into here) in terms of CO2 emissions complicates conversion.

In [ ]:
units_mapper = {
    "Million tonnes/yr": "MTS",
    "Million Tonnes CO2": "MTCO",
    "Million bbl/yr": "Mbbl",
    "Bcf/yr": "Bcf"
}

parent_enum = pl.Enum(
    ['Nation State', 'State-owned Entity', 'Investor-owned Company']
)

med_gran_agg = med_gran.group_by(['parent_type', 'commodity', 'production_unit']).agg(
    pl.col("production_value").sum().alias("sum_prod_val"),
    pl.col("total_emissions_MtCO2e").sum().alias("sum_mtco2e")
    ).with_columns(
        pl.col("production_unit").replace(units_mapper)
    ).sort(
        pl.col("production_unit", "commodity")
    ).with_columns(
        pl.col("parent_type").cast(parent_enum)
    )

(
    GT(med_gran_agg, rowname_col="commodity", groupname_col="parent_type")
    .tab_header("Total Carbon Majors production & emissions")
    .tab_stubhead("")
    .fmt_integer(['sum_prod_val', 'sum_mtco2e'])
    .cols_label(
        sum_prod_val="Produced / yr",
        sum_mtco2e=html("Emitted<br>(MtCO2e) / yr"),
        production_unit = "Units"
    )
)
Out[ ]:
Total Carbon Majors production & emissions
Units Produced / yr Emitted
(MtCO2e) / yr
Nation State
Natural Gas Bcf 335,678 24,468
Cement MTCO 46,323 23,161
Anthracite Coal MTS 16,179 47,205
Bituminous Coal MTS 101,775 276,231
Lignite Coal MTS 22,393 29,989
Metallurgical Coal MTS 24,553 72,839
Sub-Bituminous Coal MTS 2,095 4,230
Oil & NGL Mbbl 95,475 38,073
State-owned Entity
Natural Gas Bcf 1,778,095 129,606
Bituminous Coal MTS 7,745 21,022
Lignite Coal MTS 1,201 1,608
Metallurgical Coal MTS 202 600
Sub-Bituminous Coal MTS 14,460 29,196
Oil & NGL Mbbl 709,077 282,764
Investor-owned Company
Natural Gas Bcf 1,441,634 105,081
Cement MTCO 10,316 6,521
Anthracite Coal MTS 100 291
Bituminous Coal MTS 12,347 33,512
Lignite Coal MTS 8,284 11,095
Metallurgical Coal MTS 4,760 14,120
Sub-Bituminous Coal MTS 8,106 16,367
Thermal Coal MTS 19,302 45,734
Oil & NGL Mbbl 519,851 207,305
In [ ]:
parent_cmap = {
    'Nation State': '#A65041',
    'State-owned Entity': '#E7CDC2',
    'Investor-owned Company': '#80A0C7'
}

med_gran_viz = med_gran_agg.group_by(['parent_type', 'production_unit']).agg(pl.col("sum_prod_val").sum())

(
    pn.ggplot(med_gran_viz, pn.aes(x="parent_type", y="sum_prod_val", fill="parent_type"))
    + pn.facet_wrap("production_unit",scales="free_x")
    + pn.geom_col()
    + pn.coord_flip()
    + pn.scale_x_discrete(name="")
    + pn.scale_y_continuous(name="Units produced")
    + pn.scale_fill_manual(values = parent_cmap)
    + pn.guides(fill = "none")
    + pn.theme(panel_grid = pn.element_blank(),
        panel_grid_major_x = pn.element_line(color="#EBEBEB"))
)
No description has been provided for this image

Top-producing entities by decade¶

Let's filter down to only the "Oil & NGL" group and keep only the entities that make it into a top 5 (in any decade available in the data)

In [ ]:
top_ent_groups = ['parent_type', 'decade']

top_entities = (
    med_gran
    .filter(pl.col('production_unit').is_in(['Million bbl/yr']))
    .with_columns((pl.col('year') / 10).cast(pl.Int32).alias('decade'))
    .with_columns((pl.col('decade') * 10))
    .group_by(['parent_entity', 'parent_type', 'production_unit', 'decade'])
    .agg(
        pl.col("production_value").sum()
    )
    .sort(top_ent_groups)
    .filter(pl.int_range(0, pl.len()).over(top_ent_groups) < 5)
)

top_entities.head()
Out[ ]:
shape: (5, 5)
parent_entityparent_typeproduction_unitdecadeproduction_value
strstrstri32f64
"ExxonMobil""Investor-owned Company""Million bbl/yr"18802.412678
"Shell""Investor-owned Company""Million bbl/yr"189010.194514
"ExxonMobil""Investor-owned Company""Million bbl/yr"189025.54388
"Shell""Investor-owned Company""Million bbl/yr"190030.897365
"ExxonMobil""Investor-owned Company""Million bbl/yr"190069.458772
In [ ]:
(
    pn.ggplot(top_entities, pn.aes(x='decade', y='production_value', group='parent_entity'))
    + pn.geom_line(pn.aes(color='parent_type'), size=1.1, alpha = 0.618)
    + pn.scale_y_log10(name="Mbbl/yr")
    + pn.scale_color_manual(values = parent_cmap)
    + pn.guides(color = pn.guide_legend(position=(0.05,0.95), title=""))
    + pn.theme(panel_grid_minor = pn.element_blank(),
        panel_grid_major_x = pn.element_line(color="#EBEBEB"))
)
No description has been provided for this image

Now, recreating with interactive tool to get more info on the entities themselves. Exporting the interactive version is still tricky so for now we specify the renderer as shown and live with a larger-than-expected file.

In [ ]:
import plotly_express as px
import plotly.io as pio
pio.renderers.default = "notebook"

fig = px.line(
    top_entities,
    x = 'decade',
    y = 'production_value',
    color = 'parent_type',
    color_discrete_map=parent_cmap,
    line_group = 'parent_entity',
    log_y = True,
    width=700,
    labels={
        'decade': '',
        'production_value': 'Mbbl/yr'
    },
    title = "Top 5 Oil & NGL producers over the decades"
).update_layout(legend=dict(
    title="",
    yanchor="top",
    y=0.99,
    xanchor="left",
    x=0.01
)).update_layout(
    plot_bgcolor="#f2f2f2"
)

fig.show()